QC across assays & MAE augmentation

Author

Laura Symul

Published

May 27, 2025

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)

tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the MultiAssayExperiment object

Code
data_source <- "real"
mae_files <- 
  fs::dir_ls(
    str_c(
      get_data_dir(data_source = data_source),  
      "02 MAEs/"
    ),
    regexp = "mae_full_.*\\.rds$"
  ) |> 
  sort(decreasing = TRUE) |>
  magrittr::extract(1)

mae <- readRDS(mae_files)

rm(mae_files)

Cross-assay QC

Comparing qPCR and metagenomics

Code
tmp <- 
  dplyr::inner_join(
    mae[["mg"]] |> 
      as_tibble() |>
      filter(!is.na(LBP)) |> 
      filter(!is.na(rel_abs_bact)) |> 
      select(.feature, .sample, rel_abs_bact, sample_type) |> 
      dplyr::rename(
        mg_rel_ab_bact = rel_abs_bact
      ),
    mae[["qPCR"]] |>
      as_tibble() |>
      filter(!is.na(LBP)) |> 
      select(.feature, .sample, copies_per_swab_med, copies_per_swab_cv, strain_group_qpcr),
    by = join_by(.feature, .sample)
  )

tmp |> 
  ggplot() +
  aes(x = mg_rel_ab_bact, y = copies_per_swab_med, color = sample_type) +
  geom_point(alpha = 0.3, size = 1) +
  facet_wrap(~ strain_group_qpcr + .feature) +
  scale_y_log10() +
  scale_x_log10()

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 0) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 0`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 1e+05) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 1e+05`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

Code
tmp |> 
  dplyr::count(.feature, mg_rel_ab_bact > 0, copies_per_swab_med > 10000) |> 
  ggplot() +
  aes(x = `mg_rel_ab_bact > 0`, y = `copies_per_swab_med > 10000`) +
  geom_tile(aes(fill = n)) +
  geom_text(aes(label = n)) +
  scale_fill_gradient(low = "white", high = "dodgerblue3") +
  facet_wrap(~ .feature) 

TODO sensitivity-specificity curves for each LBP

Metagenomics and 16S rRNA amplicon sequencing

TODO

Temporal profiles (for 10 “random” participants)

Weekly profiles

Code
weekly_samples_from_randomized_participants <- 
  dplyr::inner_join(
    tibble(uid = mae[["mg"]]$uid),
    tibble(uid = mae[["amplicon"]]$uid),
    by = join_by(uid)
  ) |> 
  dplyr::inner_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      filter(randomized) |> 
      select(uid, pid, visit_code, randomized, arm, visit_type),
    by = join_by(uid)
  ) |> 
  filter(visit_type == "Clinic")
  

mae_sub <- mae[, weekly_samples_from_randomized_participants$uid]
Code
random_pids <- mae_sub$pid |> table() |> sort(decreasing = TRUE) |> names() |> head(10)
Code
map(
  random_pids,
  plot_data_for_pid,
  mae_sub = mae_sub
)
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]

Full profiles

Code
all_samples_from_randomized_participants <- 
  dplyr::full_join(
    tibble(uid = mae[["mg"]]$uid),
    tibble(uid = mae[["amplicon"]]$uid),
    by = join_by(uid)
  ) |> 
  dplyr::inner_join(
    mae@colData |> as.data.frame() |> as_tibble() |> 
      filter(randomized) |> 
      select(uid, pid, visit_code, randomized, arm),
    by = join_by(uid)
  )

mae_sub <- mae[, all_samples_from_randomized_participants$uid]
Code
# random_pids <- mae_sub$pid |> table() |> sort(decreasing = TRUE) |> names() |> head(10)
Code
map(
  random_pids,
  plot_data_for_pid,
  mae_sub = mae_sub
)
[[1]]


[[2]]


[[3]]


[[4]]


[[5]]


[[6]]


[[7]]


[[8]]


[[9]]


[[10]]

Participant groups

“Guessing the arms” based on the MG and qPCR data

  • if at least > 3% of any LBP115-specific strains or > 10% of all LBP115-specific strains in the MG data at any time-point -> “115”

  • if in group 4 -> overlap arm

  • if at least 3% of any LBP106 strains or > 10% of all LBP106 strains in the qPCR data at any time-point -> “106 3 or 7”

  • otherwise -> placebo

TODO

Primary outcomes

Primary outcome definition

The primary outcome is “colonization with any of the L. crispatus strains contained in the LBP by 5 weeks of follow-up as assessed by metagenomic sequencing of the vaginal microbial community with detection of any one of LBP strains at >5% relative abundance or a combination of the strains accounting for >10% relative abundance by metagenomics.”

“LBP colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization at that visit.

Code
colonization <- 
  mae[["mg"]] |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample") |> 
  filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    n = n(),
    total_LBP_rel_ab_at = sum(rel_abs_bact),
    max_LBP_rel_ab_at = max(rel_abs_bact),
    n_detected_strains_at = sum(rel_abs_bact > 0),
    n_detected_LC106_strains_at = sum(rel_abs_bact[LBP == "LC-106 & LC-115"] > 0),
    n_detected_LC115_only_strains_at = sum(rel_abs_bact[LBP == "LC-115"] > 0)
  ) |> 
  mutate(
    colonized_at = (total_LBP_rel_ab_at > 0.1) | (max_LBP_rel_ab_at > 0.05)
  ) 
Code
colonization <-  
  colonization |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_at) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“LBP colonization” by each visit

From the colonization status at each visit, we can compute the colonization status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization <- 
  colonization |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_at) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization <- 
  colonization |> 
  arrange(pid, visit_code) |>
  mutate(colonized_at = colonized_at |> replace_na(FALSE)) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, colonized_at),
    colonized_by = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() |> 
  select(-tmp)
Code
colonization |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_by) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Adding primary_outcomes assay to MAE

Code
se_primary <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization |> 
        # filter(!is.na(pid)) |> 
        # mutate(.sample = str_c(pid, "_", visit_code)) |> 
        filter(!is.na(.sample)) |> 
        select(.sample, colonized_at, colonized_by) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
      )
  )

mae <- c(mae, list(primary_outcomes = se_primary))

Saving the MultiAssayExperiment objects

We save the “master” MAE (with all assays); but if needed for publication, we can subset the MAE to only include the assays/data of interest.

Code
saveRDS(
  mae, 
  str_c(
    get_data_dir(data_source = data_source),  
    "03 QCed MAEs/",
    "mae_full_", today() |> str_remove_all("-"), ".rds"
  )
)